<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <!-- Meta tags for search engines and social media banners: these act as the page's
       "business card" when a link to it is shared. -->
  <meta name="description" content="DATTT: a depth-aware test-time training strategy for zero-shot video object segmentation. The model is updated at test time by predicting consistent depth maps for the same frame under different data augmentations. CVPR 2024.">
  <meta property="og:title" content="Depth-aware Test-Time Training for Zero-shot Video Object Segmentation">
  <meta property="og:description" content="A test-time training strategy for zero-shot video object segmentation that enforces consistent depth prediction under different data augmentations. CVPR 2024.">
  <!-- TODO: replace with the absolute public URL of this page once it is deployed. -->
  <meta property="og:url" content="URL OF THE WEBSITE">
  <!-- TODO: add a banner image at the path below. Optimal dimensions are 1200x630. -->
  <meta property="og:image" content="static/images/your_banner_image.png">
  <meta property="og:image:width" content="1200">
  <meta property="og:image:height" content="630">

  <meta name="twitter:title" content="Depth-aware Test-Time Training for Zero-shot Video Object Segmentation">
  <meta name="twitter:description" content="A test-time training strategy for zero-shot video object segmentation that enforces consistent depth prediction under different data augmentations. CVPR 2024.">
  <!-- TODO: add a banner image at the path below. Optimal dimensions are 1200x600. -->
  <meta name="twitter:image" content="static/images/your_twitter_banner_image.png">
  <meta name="twitter:card" content="summary_large_image">
  <!-- Keywords for the paper to be indexed by -->
  <meta name="keywords" content="zero-shot video object segmentation, ZSVOS, test-time training, TTT, depth estimation, DATTT, CVPR 2024">
  <meta name="viewport" content="width=device-width, initial-scale=1">

  <title>DATTT: Depth-aware Test-Time Training for Zero-shot Video Object Segmentation</title>
  <link rel="icon" type="image/x-icon" href="static/images/favicon.ico">
  <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro">

  <link rel="stylesheet" href="static/css/bulma.min.css">
  <link rel="stylesheet" href="static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="static/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="static/css/index.css">

  <!-- The scripts below without async/defer load in document order; index.js comes last. -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
  <script defer src="static/js/fontawesome.all.min.js"></script>
  <script src="static/js/bulma-carousel.min.js"></script>
  <script src="static/js/bulma-slider.min.js"></script>

  <!-- MathJax v2 picks up text/x-mathjax-config blocks when MathJax.js starts, so the
       configuration is declared before the (async) loader script. -->
  <script type="text/x-mathjax-config">
    MathJax.Hub.Config({
      tex2jax: {
        inlineMath: [['$','$'], ['\\(','\\)']],
        processEscapes: true
      }
    });
  </script>
  <script async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-MML-AM_CHTML"></script>

  <script src="static/js/index.js"></script>
</head>
<body>


  <!-- Title block: paper title, authors, affiliations, venue and resource links -->
  <section class="hero">
    <div class="hero-body">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column has-text-centered">
            <h1 class="xtitle is-1 publication-title">Depth-aware Test-Time Training for <br> Zero-shot Video Object Segmentation</h1>

            <!-- Paper authors. Superscript numbers index the affiliation list below.
                 Authors without a homepage use an href-less <a> (an inert placeholder)
                 so that no broken link is emitted; add an href when one is available. -->
            <div class="is-size-5 publication-authors">
              <span class="author-block"><a>Weihuang Liu</a><sup>1</sup></span>
              <span class="author-block"><a href="https://xishen0220.github.io/" target="_blank" rel="noopener">Xi Shen</a><sup>2</sup></span>
              <span class="author-block"><a>Haolun Li</a><sup>1</sup></span>
              <span class="author-block"><a>Xiuli Bi</a><sup>3</sup></span>
              <span class="author-block"><a>Bo Liu</a><sup>3</sup></span>
              <span class="author-block"><a href="https://www.cis.um.edu.mo/~cmpun/" target="_blank" rel="noopener">Chi-Man Pun</a><sup>*,1</sup></span>
              <span class="author-block"><a href="https://vinthony.github.io/" target="_blank" rel="noopener">Xiaodong Cun</a><sup>*,4</sup></span>
            </div>

            <!-- Affiliations and venue -->
            <div class="is-size-5 publication-authors">
              <sup>1</sup> University of Macau &nbsp;&nbsp;&nbsp;
              <sup>2</sup> Intellindust&nbsp;
              <br>
              <sup>3</sup> Chongqing University of Posts and Telecommunications&nbsp;&nbsp;
              <sup>4</sup> Tencent AI Lab &nbsp;&nbsp;&nbsp;
              <br>CVPR 2024
            </div>

            <div class="column has-text-centered">
              <div class="publication-links">
                <!-- arXiv PDF link -->
                <span class="link-block">
                  <a class="external-link" href="https://arxiv.org/pdf/2403.04258.pdf" target="_blank" rel="noopener">
                    <span class="icon">
                      <i class="fas fa-file-pdf" aria-hidden="true"></i>
                    </span>
                    <span>Paper</span>
                  </a>
                </span>

                <!-- GitHub link -->
                <span class="link-block">
                  <a class="external-link" href="https://github.com/NiFangBaAGe/DATTT" target="_blank" rel="noopener">
                    <span class="icon">
                      <i class="fab fa-github" aria-hidden="true"></i>
                    </span>
                    <span>Code</span>
                  </a>
                </span>

                <!-- arXiv abstract link -->
                <span class="link-block">
                  <a class="external-link" href="https://arxiv.org/abs/2403.04258" target="_blank" rel="noopener">
                    <span class="icon">
                      <i class="ai ai-arxiv" aria-hidden="true"></i>
                    </span>
                    <span>arXiv</span>
                  </a>
                </span>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>


<!-- Paper abstract -->
<section class="section hero is-light">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p>
            Zero-shot Video Object Segmentation (ZSVOS) aims at segmenting the primary moving object without any human annotations.
            Mainstream solutions mainly focus on learning a single model on large-scale video datasets, which struggle to generalize to unseen videos.
            In this work, we introduce a test-time training (TTT) strategy to address the problem.
            Our key insight is to enforce the model to predict consistent depth during the TTT process.
            In detail, we first train a single network to perform both segmentation and depth prediction tasks.
            This can be effectively learned with our specifically designed depth modulation layer.
            Then, for the TTT process, the model is updated by predicting consistent depth maps for the same frame under different data augmentations.
            In addition, we explore different TTT weight updating strategies.
            Our empirical results suggest that the momentum-based weight initialization and looping-based training scheme lead to more stable improvements.
            Experiments show that the proposed method achieves clear improvements on ZSVOS.
            Our proposed video TTT strategy provides significant superiority over state-of-the-art TTT methods.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End paper abstract -->


<!-- Overview: teaser figure with a short caption -->
<section class="hero teaser">
  <!-- NOTE(review): the fixed 25rem side padding leaves little or no room for content on
       narrow viewports; consider replacing it with a responsive container class. -->
  <div class="row" style="text-align: center; padding-left:25rem; padding-right:25rem; padding-bottom:1rem;">
    <h2 class="title is-3">Overview</h2>
    <div class="hero-body">
      <img src="static/images/teaser.jpg" alt="Overview of DATTT: during test-time training the model predicts depth maps for differently augmented views of the same video frame, and its mask prediction becomes more precise as it is updated.">
      <p>
        Our key insight is to enforce the model to predict consistent depth during the TTT process.
        During the test-time training, the model is required to predict consistent depth maps for the same video frame under different data augmentations.
        The model is progressively updated and provides more precise mask prediction.
      </p>
    </div>
  </div>
</section>
<!-- End overview -->


<!-- Pipeline: framework figure with a short caption -->
<section class="hero teaser">
    <h2 class="title is-3">Pipeline</h2>
  <!-- NOTE(review): the fixed 25rem side padding leaves little or no room for content on
       narrow viewports; consider replacing it with a responsive container class. -->
  <div class="row" style="text-align: center; padding-left:25rem; padding-right:25rem; padding-bottom:1rem;">
    <div class="hero-body">
      <img src="static/images/framework.jpg" alt="DATTT pipeline: a two-stream ZSVOS network with an added depth decoder; at test time the error between depth maps predicted for augmented frames updates the image encoder.">
      <p>
        We add a depth decoder to the commonly used two-stream ZSVOS architecture to learn 3D knowledge.
        The model is first trained on large-scale datasets for object segmentation and depth estimation.
        Then, for each test video, we employ photometric distortion-based data augmentation to the frames.
        The error between the predicted depth maps is back-propagated to update the image encoder.
        Finally, the new model is applied to infer the object.
      </p>
    </div>
  </div>
</section>
<!-- End pipeline -->


<!-- Video demo: each item pairs the pre-trained model's result (first video) with the
     DATTT result (second video) for one sequence. Element ids are unique per video. -->
<section class="section hero is-light">
  <div class="hero-body">
      <h2 class="title is-3">Video Demo</h2>
    <div class="columns is-centered has-text-centered">
      <div class="marquee-item">
        <!-- Playback is slowed to half speed as soon as loading starts. -->
        <video onloadstart="this.playbackRate = 0.5;" id="video1" class="lozad" autoplay muted loop playsinline height="100%" aria-label="Court sequence, pre-trained model without TTT">
          <source src="static/videos/nottt_court.mp4" type="video/mp4">
          Sorry, your browser does not support HTML5 video.
        </video>
        <video onloadstart="this.playbackRate = 0.5;" id="video2" class="lozad" autoplay muted loop playsinline height="100%" aria-label="Court sequence, with DATTT">
          <source src="static/videos/ttt_court.mp4" type="video/mp4">
          Sorry, your browser does not support HTML5 video.
        </video>
      </div>
      <div class="marquee-item">
        <video onloadstart="this.playbackRate = 0.5;" id="video3" class="lozad" autoplay muted loop playsinline height="100%" aria-label="Stair sequence, pre-trained model without TTT">
          <source src="static/videos/nottt_stair.mp4" type="video/mp4">
          Sorry, your browser does not support HTML5 video.
        </video>
        <video onloadstart="this.playbackRate = 0.5;" id="video4" class="lozad" autoplay muted loop playsinline height="100%" aria-label="Stair sequence, with DATTT">
          <source src="static/videos/ttt_stair.mp4" type="video/mp4">
          Sorry, your browser does not support HTML5 video.
        </video>
      </div>
      <div class="marquee-item">
        <video onloadstart="this.playbackRate = 0.5;" id="video5" class="lozad" autoplay muted loop playsinline height="100%" aria-label="Worm sequence, pre-trained model without TTT">
          <source src="static/videos/nottt_worm.mp4" type="video/mp4">
          Sorry, your browser does not support HTML5 video.
        </video>
        <video onloadstart="this.playbackRate = 0.5;" id="video6" class="lozad" autoplay muted loop playsinline height="100%" aria-label="Worm sequence, with DATTT">
          <source src="static/videos/ttt_worm.mp4" type="video/mp4">
          Sorry, your browser does not support HTML5 video.
        </video>
      </div>
    </div>
  </div>
  <!-- NOTE(review): the fixed 25rem side padding leaves little or no room for content on
       narrow viewports; consider replacing it with a responsive container class. -->
  <div class="row" style="text-align: center; padding-left:25rem; padding-right:25rem; padding-bottom:1rem;">
    <p>
      The results obtained by the pre-trained model are less accurate (top), and become better and better with our DATTT (bottom).
    </p>
  </div>
</section>
<!-- End video demo -->

<!-- [disabled] Teaser Image -->
<!--<section class="hero teaser">-->
<!--  <div class="container is-max-desktop">-->
<!--    <h2 class="title is-3">Results</h2>-->
<!--    <div class="hero-body">-->
<!--      <img src="static/images/result.png">-->
<!--	<p>-->

<!--	</p>-->
<!--	    -->
<!--    </div>-->
<!--  </div>-->
<!--</section>-->
<!-- [disabled] End teaser Image -->



<!-- Paper poster 
<section class="hero is-small is-light">
  <div class="hero-body">
    <div class="container">
      <h2 class="title">Poster</h2>
      <iframe  src="static/pdfs/sample.pdf" width="100%" height="550">
          </iframe>
        
      </div>
    </div>
  </section>
End paper poster -->


<!-- BibTeX citation.
     The entry text starts right after <code> and is not indented, because all
     whitespace inside <pre> is rendered (and copied) verbatim. -->
<section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <h2 class="title">BibTeX</h2>
    <pre><code>@inproceedings{liu2024depth,
  title={Depth-aware Test-Time Training for Zero-shot Video Object Segmentation},
  author={Liu, Weihuang and Shen, Xi and Li, Haolun and Bi, Xiuli and Liu, Bo and Pun, Chi-Man and Cun, Xiaodong},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year={2024}
}</code></pre>
  </div>
</section>
<!-- End BibTeX citation -->


  <!-- Page footer: template attribution and content license -->
  <footer class="footer">
    <div class="container">
      <div class="columns is-centered">
        <div class="column is-8">
          <div class="content">
            <p>
              This page was built using the <a href="https://github.com/vinthony/project-page-template">modified version</a> of <a href="https://github.com/eliahuhorwitz/Academic-project-page-template" target="_blank" rel="noopener">Academic Project Page Template</a> from <a href="https://github.com/vinthony">vinthony</a>.
              You are free to borrow the source code of this website, we just ask that you link back to this page in the footer. <br> This website is licensed under a <a rel="license noopener" href="https://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
              Commons Attribution-ShareAlike 4.0 International License</a>.
            </p>
          </div>
        </div>
      </div>
    </div>
  </footer>

<!-- Statcounter tracking code -->
  
<!-- You can add a tracker to track page visits by creating an account at statcounter.com -->

    <!-- End of Statcounter Code -->

  </body>
  </html>
